In [1]:
from pyspark.sql.types import *

# Schema for the synthetic dataset: 100 feature columns x0..x99 plus a label
# column "y". Columns x18, x63 and x69 are strings; every other column is a
# double. All columns are nullable. Built programmatically instead of listing
# 101 near-identical StructFields by hand.
_STRING_COLS = {18, 63, 69}
schema = StructType(
    [StructField("x%d" % i,
                 StringType() if i in _STRING_COLS else DoubleType(),
                 True)
     for i in range(100)]
    + [StructField("y", DoubleType(), True)]
)
In [2]:
sparkSession=SparkSession.builder.master("local[*]").appName("appName").config("spark.sql.warehouse.dir", "./spark-warehouse").getOrCreate()
In [29]:
# Loads the train/test datasets using the Spark 2.x DataFrame reader.
# (The legacy Spark 1.6 SQLContext path that used to live here has been
# removed; SQLContext is still imported in case older code depends on it.)
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler

# NOTE(review): hardcoded absolute local path — point DATA_DIR at your own
# copy of the generated datasets before running.
DATA_DIR = "/Users/idownard/development/snape"

def _load_csv(path):
    """Read a headered CSV into a DataFrame using the predeclared `schema`."""
    return (sparkSession.read.format("csv")
            .schema(schema)
            .option("header", "true")
            .load(path))

train = _load_csv(DATA_DIR + "/my_dataset_train.csv")
test = _load_csv(DATA_DIR + "/my_dataset_test.csv")
In [30]:
# Assembles the first twelve feature columns into a single "features" vector
# column (only x1..x12 of the 100 generated features are clustered here).
featureCols = ["x%d" % idx for idx in range(1, 13)]
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

trainData = assembler.transform(train)
testData = assembler.transform(test)

print("Assembled features:")
trainData.select("features").limit(5).show(truncate=False)
Assembled features:
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                                                            |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[-2.44368823381,-5.43788699779,1.66150670287,-0.956849833578,0.259904021092,5.23602569704,-8.44911321974,-29.1561537466,0.655967982883,-2.11732026056,-4.07302790566,0.320132911086]|
|[0.434319873134,-11.5298791835,0.795999271268,-1.49726136599,1.80562213637,14.7280418269,17.4778682536,-6.86323927424,-1.12462954071,-3.09438008794,3.63444464981,0.14974488302]    |
|[-1.80336945688,-8.12718049468,-5.2076103246,-2.56761536372,-0.901936683262,-4.53511077382,10.8936427829,-27.2380317432,2.0049201664,1.02406400604,0.266727576852,-0.0576925547149] |
|[3.8739464464,14.4202262316,4.90371001903,3.89306497841,-1.51733312827,-2.62852698818,5.26815564946,-22.8780586018,-0.553033027992,-1.19253630523,0.196937471594,-0.447146972243]   |
|[0.104032570328,25.7281053013,-1.95806325824,-2.25247654694,-1.50085945776,9.72502200221,2.00463105872,-0.212221784535,-2.81444446257,-1.42155575962,4.69608242693,-0.874172570158] |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

In [5]:
# Trains a k-means model (k=4) on the assembled feature vectors.
from pyspark.ml.clustering import KMeans

# NOTE(review): maxIter=1 performs a single Lloyd iteration, so the centers
# are barely refined from the initialization — presumably for speed; confirm.
kmeans = KMeans(k=4, featuresCol="features", maxIter=1)
model = kmeans.fit(trainData)

# Shows the learned cluster centers.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)
Cluster Centers: 
[  1.11850331e-02   9.48538135e+00   2.40031500e-02   4.16465453e-02
  -2.67976127e-02  -4.83492356e-02   2.87853846e-01  -5.83766436e+00
  -6.55564876e-02  -1.73611932e-02  -1.68214346e-03   4.01573155e-04]
[ -2.82225279e-03   3.20865047e+00  -1.32994113e-01   2.86771357e-02
  -1.38673607e-02   3.98622745e-03  -5.95791725e-01   2.51498887e+01
   5.33054154e-02  -1.72607958e-02   1.02838546e-02   4.04181391e-03]
[  4.70313276e-02  -7.33529936e+00  -3.51726115e-01  -2.72206466e-01
   2.60173636e-02   2.58528515e-01   1.32298072e+00  -2.61755640e+01
   1.50789405e-01   2.51303812e-02   1.36083852e-02  -2.51200044e-02]
[ -7.78858856e-02   1.26566096e+01   3.77310376e-01   6.40909522e-01
  -8.70484368e-02  -5.54562864e-01  -3.57872575e+00  -4.27797426e+01
   2.80563373e-01   7.39894157e-02   1.99682181e-02  -2.07509279e-02]
In [54]:
# Assigns each held-out row to a cluster and peeks at a few assignments.
transformed = model.transform(testData)
transformed.selectExpr("abs(x2)", "abs(x3)", "prediction").limit(5).show(truncate=False)

# Cluster sizes, ordered by cluster id.
predictions = (
    transformed
    .selectExpr("abs(x1)", "prediction")
    .groupBy("prediction")
    .agg({'prediction': 'count'})
    .orderBy("prediction")
)
predictions.show(truncate=False)
+-------------+-------------+----------+
|abs(x2)      |abs(x3)      |prediction|
+-------------+-------------+----------+
|7.92447924573|11.3397342566|1         |
|18.657507628 |4.35878048142|0         |
|20.4394296231|2.44992761344|0         |
|5.52572149659|6.67367237155|2         |
|4.29128447152|7.55227693926|1         |
+-------------+-------------+----------+

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|0         |759              |
|1         |666              |
|2         |372              |
|3         |203              |
+----------+-----------------+

In [55]:
# Collects the small per-cluster aggregate into a local pandas frame so the
# plotting libraries below can consume it.
data = predictions.toPandas()
print(type(data))
print(data)
<class 'pandas.core.frame.DataFrame'>
   prediction  count(prediction)
0           0                759
1           1                666
2           2                372
3           3                203
In [56]:
# Configures cufflinks so plotly charts render offline inside the notebook.
import pandas as pd
import cufflinks as cf

cf.set_config_file(world_readable=True, offline=True)
In [57]:
# Pie chart of cluster ("persona") sizes from the pandas frame built above;
# 'count(prediction)' is the column name Spark's agg produced.
data.iplot(kind='pie',labels='prediction',values='count(prediction)',title="Persona Size")
In [62]:
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

# Heatmap of the summed |feature| values per cluster.
abs_cols = ["abs(x%d)" % i for i in range(1, 13)]
df = (transformed
      .selectExpr(*(abs_cols + ["prediction"]))
      .groupBy("prediction")
      .agg({c: 'sum' for c in abs_cols})
      .orderBy("prediction"))

# BUG FIX: the aggregate's column order is not guaranteed to follow the dict,
# and the result also contains the `prediction` grouping column — so taking
# `zdata.values` wholesale misaligned the heatmap columns with the x labels.
# Reorder explicitly to feature order and drop `prediction` from the z matrix.
zdata = df.toPandas()[["sum(%s)" % c for c in abs_cols]]

data = [go.Heatmap(
    z=zdata.values.tolist(),
    y=['Persona A', 'Persona B', 'Persona C', 'Persona D'],
    # NOTE(review): the original listed 14 x labels for only 12 feature
    # columns; the last two ('Cell Phone', 'Landline') had no data column and
    # are dropped here — confirm the intended feature-to-label mapping.
    x=['Debit Card',
       'Personal Credit Card',
       'Business Credit Card',
       'Home Mortgage Loan',
       'Auto Loan',
       'Brokerage Account',
       'Roth IRA',
       '401k',
       'Home Insurance',
       'Automobile Insurance',
       'Medical Insurance',
       'Life Insurance'],
    colorscale='Viridis')]

plotly.offline.iplot(data, filename='pandas-heatmap')
In [11]:
from sklearn.datasets import make_classification, make_regression
from sklearn.externals import six
import pandas as pd
import numpy as np
import argparse
import json
import re
import os
import sys

def rename_columns(df, prefix='x'):
    """
    Return a copy of the dataframe with every column label prefixed.

    :param df: data frame we're operating on (left unmodified)
    :param prefix: string prepended to each stringified column label
    :return: a new dataframe with renamed columns
    """
    renamed = df.copy()
    renamed.columns = ['{}{}'.format(prefix, label) for label in renamed.columns]
    return renamed
In [12]:
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_gaussian_quantiles
In [13]:
# 2-D scatter of a synthetic two-class dataset. With exactly two feature
# columns the clusters separate visibly; with more features they would
# overlap badly in a 2-D projection.
X, Y = make_classification(n_samples=1000, n_classes=2, n_features=2,
                           n_redundant=0, n_informative=2,
                           scale=1000, n_clusters_per_class=1)
df = pd.DataFrame(X)

fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], marker='o', c=Y)
plt.show()
print(df.head())
             0            1
0 -1692.930435  1022.020566
1  1058.489382   560.479073
2 -1037.217426  -438.327664
3  1012.040844   989.735433
4 -1017.828244  2046.095517
In [67]:
# Generates a fresh 3-class / 3-feature synthetic dataset for the 3-D plots below.
# NOTE(review): `plotly.plotly` moved to the separate `chart_studio` package in
# plotly 4.x, and `py` is never used in this notebook — confirm and remove.
import plotly.plotly as py
import pandas as pd
X, Y = make_classification(n_samples=100, n_classes=3, n_features=3, n_redundant=0, n_informative=3,
                             scale=1000, n_clusters_per_class=1)
In [74]:
# 3-D scatter + translucent mesh of three synthetic clusters. Makes sense for
# clustering three feature columns; clusters may really overlap if there are
# more than 3 feature columns.
df = pd.DataFrame(X)
# rename columns to x0, x1, x2 and attach the class label
df = rename_columns(df)
df['y'] = Y

# One sub-frame per generated class label, with a display name and color each.
clusters = [df.loc[df['y'] == label] for label in (0, 1, 2)]
names = ["Income", "Spending", "Leverage"]
colors = ["green", "blue", "red"]

def _scatter_trace(frame, name, color):
    """Point-cloud trace for one cluster (first three columns as x/y/z)."""
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; to_numpy()
    # is the supported replacement and returns the same ndarray view.
    pts = frame.to_numpy()
    return dict(mode="markers", name=name, type="scatter3d",
                x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
                marker=dict(size=2, color=color))

def _mesh_trace(frame, name, color):
    """Translucent alpha-shape hull for one cluster."""
    pts = frame.to_numpy()
    return dict(alphahull=5, name=name, opacity=.1, type="mesh3d",
                x=pts[:, 0], y=pts[:, 1], z=pts[:, 2],
                color=color, showscale=True)

# Build the six traces (was six copy-pasted dict literals that also rebound
# the cluster DataFrame names to plain dicts).
scatters = [_scatter_trace(c, n, col) for c, n, col in zip(clusters, names, colors)]
meshes = [_mesh_trace(c, n, col) for c, n, col in zip(clusters, names, colors)]

layout = dict(
    title='Customer Segment Shapes',
    scene=dict(
        xaxis=dict(zeroline=False),
        yaxis=dict(zeroline=False),
        zaxis=dict(zeroline=False),
    )
)
fig = dict(data=scatters + meshes, layout=layout)
# Use py.iplot() for IPython notebook
plotly.offline.iplot(fig, filename='mesh3d_sample')
In [16]:
# Gallery of synthetic-dataset generators (adapted from scikit-learn's
# "plot_random_dataset" example).
plt.figure(figsize=(8, 8))
plt.subplots_adjust(bottom=.05, top=.9, left=.05, right=.95)

plt.subplot(321)
plt.title("One informative feature, one cluster per class", fontsize='small')
# BUG FIX: the parameters of the first two panels were swapped relative to
# their titles — this panel claims ONE informative feature, so n_informative=1.
X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=1,
                             n_clusters_per_class=1)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)

plt.subplot(322)
plt.title("Two informative features, one cluster per class", fontsize='small')
# BUG FIX (other half of the swap): TWO informative features here.
X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2,
                             n_clusters_per_class=1)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)

plt.subplot(323)
plt.title("Two informative features, two clusters per class", fontsize='small')
X2, Y2 = make_classification(n_features=2, n_redundant=0, n_informative=2)
plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2)

plt.subplot(324)
plt.title("Multi-class, two informative features, one cluster",
          fontsize='small')
X1, Y1 = make_classification(n_features=2, n_redundant=0, n_informative=2,
                             n_clusters_per_class=1, n_classes=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)

plt.subplot(325)
plt.title("Three blobs", fontsize='small')
X1, Y1 = make_blobs(n_features=2, centers=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)

plt.subplot(326)
plt.title("Gaussian divided into three quantiles", fontsize='small')
X1, Y1 = make_gaussian_quantiles(n_features=2, n_classes=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1)

plt.show()